Effect of Modifying Individual Parameters for Random Forest

This document looks at the effect that modifying the values of individual parameters of the Random Forest algorithm in sklearn has on its accuracy and on the time taken to train the model.

The best results were obtained with the following settings:

sample_size=9000 
random_state=303

with all the other arguments left at their defaults. This resulted in an out-of-sample accuracy of 0.963111111111.

If we keep the sample size at the default value of 1000, the best results were obtained with the following settings:

random_state: 3599

with all the other arguments left at their defaults. This resulted in an out-of-sample accuracy of 0.944.

In [1]:
import numpy as np
In [2]:
from rf_tester import *
from parameter_plots import *
In [3]:
from prep_terrain_data import makeTerrainData
In [4]:
from bokeh.io import output_notebook
# Tell Bokeh to display its output inline in this notebook.
output_notebook()
BokehJS successfully loaded.

Default Parameters

In [63]:
# Passed as the first argument to every loop_rf call; presumably generates the
# data set (confirm in rf_tester / prep_terrain_data).
data_generator = makeTerrainData
# Sample size used whenever sample size is not the parameter being swept.
sample_size = 1000
# Seed used whenever random_state is not the parameter being swept; 3599 is the
# best seed reported by the seed sweep in this notebook.
random_state = 3599

# Baseline value for each forest argument. Every experiment in this notebook
# sweeps one argument and holds the rest at these values.

# Ensemble construction
default_n_estimators = 10
default_bootstrap = True
default_oob_score = False
default_warm_start = False

# Split quality and feature selection
default_criterion = "gini"
default_max_features = "auto"

# Tree size limits
default_max_depth = None
default_max_leaf_nodes = None
default_min_samples_split = 2
default_min_samples_leaf = 1
default_min_weight_fraction_leaf = 0.0

# Class balancing
default_class_weight = None

Seed

In [61]:
# Sweep the random seed over 1..9999 with every other argument at its baseline.
seeds = range(1, 10000)
seed_results = loop_rf(
    data_generator,
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=seeds,  # the swept argument
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
---------------------------------------- 
Best Results 
----------------------------------------
random_state: 3599 
Out of sample accuracy: 0.944 
----------------------------------------
In [62]:
# Chart the seed sweep: one plot titled for accuracy, one for training time.
parameter_plots(
    seeds,
    results_dict=seed_results,
    x_label="Seed #",
    title_accuracy="Seed # vs Accuracy",
    title_time="Seed # vs Training Time",
    legend_pos="left_center",
)

Sample Size

In [65]:
# Sample sizes to sweep: steps of 1,000 up to 30,000, steps of 5,000 up to
# 60,000, steps of 10,000 up to 100,000, then 150,000 and 200,000.
# Each range is materialised with list() because Python 3 range objects do not
# support "+"; on Python 2 the result is the same list as before.
sample_sizes = (
    list(range(1000, 30000 + 1, 1000))
    + list(range(35000, 60000 + 1, 5000))
    + list(range(70000, 100000 + 1, 10000))
    + [150000, 200000]
)
# Sweep the sample size with the seed pinned to 303; every other argument
# stays at its baseline.
sample_sizes_results = loop_rf(
    data_generator,
    sample_size=sample_sizes,  # the swept argument
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=303,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
---------------------------------------- 
Best Results 
----------------------------------------
sample_size: 9000 
Out of sample accuracy: 0.963111111111 
----------------------------------------
In [66]:
# Express the sample sizes in thousands to match the x-axis label below.
scaled_sample_sizes = np.array(sample_sizes) / 1000.0
parameter_plots(
    scaled_sample_sizes,
    results_dict=sample_sizes_results,
    x_label="Sample Size (in thousands)",
    title_accuracy="Sample Size vs Accuracy",
    title_time="Sample Size vs Training Time",
    legend_pos="bottom_right",
)

Criterion

In [67]:
# Compare the two split criteria with every other argument at its baseline.
criteria = ["gini", "entropy"]
criteria_results = loop_rf(
    data_generator,
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=criteria,  # the swept argument
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
---------------------------------------- 
Best Results 
----------------------------------------
criterion: gini 
Out of sample accuracy: 0.944 
----------------------------------------
In [ ]:
 

Max Depth

In [68]:
# Depths to sweep: 1..29, then 35..55 in steps of 5, then 70..190 in steps of 10.
# Each range is materialised with list() because Python 3 range objects do not
# support "+"; on Python 2 the result is the same list as before.
max_depths = list(range(1, 30)) + list(range(35, 60, 5)) + list(range(70, 200, 10))
# Sweep max_depth with every other argument at its baseline.
max_depths_results = loop_rf(
    data_generator,
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=max_depths,  # the swept argument
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
---------------------------------------- 
Best Results 
----------------------------------------
max_depth: 9 
Out of sample accuracy: 0.944 
----------------------------------------
In [69]:
# Chart the max_depth sweep: one plot titled for accuracy, one for training time.
parameter_plots(
    max_depths,
    results_dict=max_depths_results,
    x_label="Max Depth",
    title_accuracy="Max Depth vs Accuracy",
    title_time="Max Depth vs Training Time",
    legend_pos="bottom_right",
)

Min Samples Split

In [70]:
# Sweep min_samples_split over 2..99 with every other argument at its baseline.
min_samples_splits = range(2, 100)
min_samples_split_results = loop_rf(
    data_generator,
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=min_samples_splits,  # the swept argument
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
---------------------------------------- 
Best Results 
----------------------------------------
min_samples_split: 2 
Out of sample accuracy: 0.944 
----------------------------------------
In [71]:
# Chart the min_samples_split sweep: one plot titled for accuracy, one for
# training time.
parameter_plots(
    min_samples_splits,
    results_dict=min_samples_split_results,
    x_label="min samples splits",
    title_accuracy="min samples splits vs Accuracy",
    title_time="min samples splits vs Training Time",
    legend_pos="top_right",
)

min_samples_leaf

In [72]:
# Sweep min_samples_leaf over 1..99 with every other argument at its baseline.
min_samples_leafs = range(1, 100)
min_samples_leaf_results = loop_rf(
    data_generator,
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=min_samples_leafs,  # the swept argument
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
---------------------------------------- 
Best Results 
----------------------------------------
min_samples_leaf: 1 
Out of sample accuracy: 0.944 
----------------------------------------
In [73]:
# Chart the min_samples_leaf sweep: one plot titled for accuracy, one for
# training time.
parameter_plots(
    min_samples_leafs,
    results_dict=min_samples_leaf_results,
    x_label="min_samples_leafs",
    title_accuracy="min_samples_leafs vs Accuracy",
    title_time="min_samples_leafs vs Training Time",
    legend_pos="top_right",
)

Max Leaf Nodes

In [74]:
# Sweep max_leaf_nodes over 2..299 with every other argument at its baseline.
max_leaf_nodes = range(2, 300)
max_leaf_nodes_results = loop_rf(
    data_generator,
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=max_leaf_nodes,  # the swept argument
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
---------------------------------------- 
Best Results 
----------------------------------------
max_leaf_nodes: 17 
Out of sample accuracy: 0.932 
----------------------------------------
In [75]:
# Chart the max_leaf_nodes sweep: one plot titled for accuracy, one for
# training time.
parameter_plots(
    max_leaf_nodes,
    results_dict=max_leaf_nodes_results,
    x_label="max_leaf_nodes",
    title_accuracy="max_leaf_nodes vs Accuracy",
    title_time="max_leaf_nodes vs Training Time",
    legend_pos="top_right",
)

Bootstrap

In [76]:
# Compare bootstrap off/on with every other argument at its baseline.
bootstraps = [False, True]
bootstrap_results = loop_rf(
    data_generator,
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=bootstraps,  # the swept argument
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
---------------------------------------- 
Best Results 
----------------------------------------
bootstrap: True 
Out of sample accuracy: 0.944 
----------------------------------------
In [77]:
# Chart the bootstrap comparison: one plot titled for accuracy, one for
# training time.
parameter_plots(
    bootstraps,
    results_dict=bootstrap_results,
    x_label="bootstrap",
    title_accuracy="bootstrap vs Accuracy",
    title_time="bootstrap vs Training Time",
    legend_pos="left_center",
)

OOB

In [78]:
# Compare oob_score off/on with every other argument at its baseline.
oobs = [False, True]
oob_results = loop_rf(
    data_generator,
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=oobs,  # the swept argument
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=default_class_weight,
)
---------------------------------------- 
Best Results 
----------------------------------------
oob_score: False 
Out of sample accuracy: 0.944 
----------------------------------------
In [79]:
# Chart the oob_score comparison: one plot titled for accuracy, one for
# training time.
parameter_plots(
    oobs,
    results_dict=oob_results,
    x_label="Using Out of bag Error Estimate",
    title_accuracy="Using Out of bag Error Estimate vs Accuracy",
    title_time="Using Out of bag Error Estimate vs Training Time",
    legend_pos="top_right",
)

Warm Start

In [80]:
# Compare warm_start off/on with every other argument at its baseline.
warm_starts = [False, True]
warm_start_results = loop_rf(
    data_generator,
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=warm_starts,  # the swept argument
    class_weight=default_class_weight,
)
---------------------------------------- 
Best Results 
----------------------------------------
warm_start: False 
Out of sample accuracy: 0.944 
----------------------------------------
In [81]:
# Chart the warm_start comparison: one plot titled for accuracy, one for
# training time.
parameter_plots(
    warm_starts,
    results_dict=warm_start_results,
    x_label="Warm Start",
    title_accuracy="Warm Start vs Accuracy",
    title_time="Warm Start vs Training Time",
    legend_pos="right_center",
)

class_weight

In [82]:
# Compare the three class_weight settings with every other argument at its
# baseline.
class_weights = ["balanced", "balanced_subsample", None]
class_weight_results = loop_rf(
    data_generator,
    sample_size=sample_size,
    n_estimators=default_n_estimators,
    criterion=default_criterion,
    max_depth=default_max_depth,
    min_samples_split=default_min_samples_split,
    min_samples_leaf=default_min_samples_leaf,
    min_weight_fraction_leaf=default_min_weight_fraction_leaf,
    max_features=default_max_features,
    max_leaf_nodes=default_max_leaf_nodes,
    bootstrap=default_bootstrap,
    oob_score=default_oob_score,
    random_state=random_state,
    warm_start=default_warm_start,
    class_weight=class_weights,  # the swept argument
)
---------------------------------------- 
Best Results 
----------------------------------------
class_weight: balanced 
Out of sample accuracy: 0.944 
----------------------------------------
In [ ]: